In [1]:
import pandas as pd
import pickle
import numpy as np

# Load the bar review dataset 
review = pd.read_pickle('../output/bar_reviews_cleaned_and_tokenized.pickle')

In [2]:
from itertools import chain
from collections import OrderedDict
reviews_merged = OrderedDict()

# Flatten the reviews, so each review is just a single list of words.
n_reviews = -1

for bus_id in set(review.business_id.values[:n_reviews]):
    # This horrible line first collapses each review of a corresponding business into a list
    # of lists, and then collapses the list of sentences to a long list of words
    reviews_merged[bus_id] = " ".join(list(chain.from_iterable( 
                                    chain.from_iterable( review.cleaned_tokenized[review.business_id==bus_id] )))
                                     )
    
docs = reviews_merged.values()

In [3]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation


n_samples = -1
n_features = 5000
n_top_words = 10

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.75, min_df=2, max_features=n_features)
t0 = time()
tf = tf_vectorizer.fit_transform(docs[:n_samples])
print("done in %0.3fs." % (time() - t0))


Extracting tf features for LDA...
done in 16.689s.

Choosing N-Topics


In [ ]:
perplexity = [] 
for n_topics in range(1,40):
    print("N Topics %i"%n_topics)
    print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
          % (n_samples, n_features))
    lda = LatentDirichletAllocation(doc_topic_prior=topic_prior=topic_prior=7/n_topics, n_topics=n_topics, max_iter=5,
                                    learning_method='online', learning_offset=10.,
                                    random_state=0, n_jobs=6)
    t0 = time()
    doc_topics = lda.fit(tf)
    print("done in %0.3fs." % (time() - t0))
    
    perplexity.append(lda.perplexity(tf))


N Topics 1
Fitting LDA models with tf features, n_samples=-1 and n_features=5000...
done in 3.122s.
N Topics 2
Fitting LDA models with tf features, n_samples=-1 and n_features=5000...
done in 11.230s.
N Topics 3
Fitting LDA models with tf features, n_samples=-1 and n_features=5000...
done in 25.444s.
N Topics 4
Fitting LDA models with tf features, n_samples=-1 and n_features=5000...

In [13]:
# Save the model 
with open('../output/LDA_perplexity.pickle', 'wb') as f: 
    pickle.dump(perplexity, f, )

Choosing Number of topics via elbow


In [10]:
import pickle
perplexity = pickle.load( open('../output/LDA_perplexity.pickle', 'rb' ))

plt.plot(range(1,30), np.log2(perplexity), marker='o', markersize=2, label='Business LDA')
plt.xlabel('Number of Topics')
plt.ylabel('$\log_2$(Perplexity)')
plt.legend(frameon=False)
plt.savefig('../images/LDA_num_topics_elbow.png')



In [ ]:


In [28]:
# This is a vector for each company in topic space
doc_topic_distr = lda.transform(tf)

In [50]:
print(doc_topic_distr.shape)
doc_topics = {'topics': reviews_merged.keys(), 'doc_topic_dist':doc_topic_distr,}
import cPickle as pickle

with open('../output/LDA_doc_topic_list.pickle', 'wb') as f: 
    pickle.dump(doc_topics, f, )


(4602, 20)

In [10]:
# Plot the sum of topics.
plt.bar(np.arange(len(np.sum(doc_topic_distr, axis=0))), np.average(doc_topic_distr, axis=0), color='steelblue')



NameErrorTraceback (most recent call last)
<ipython-input-10-024f7b589f23> in <module>()
      1 # Plot the sum of topics.
----> 2 plt.bar(np.arange(len(np.sum(doc_topic_distr, axis=0))), np.average(doc_topic_distr, axis=0), color='steelblue')

NameError: name 'doc_topic_distr' is not defined

In [ ]:


In [ ]:


In [11]:
from scipy.stats import beta

In [40]:
for i, a in enumerate(np.logspace(-.2,0,4)):
    b = 1
    x = np.linspace(beta.ppf(0.01, a, b), beta.ppf(0.99, a, b), 100)
    plt.plot(x, beta.pdf(x, a, b), color='r', lw=2, alpha=(i+2)/6., label='beta(%1.2f, 1)'%a)
plt.legend(frameon=False)
plt.yscale('log')
plt.ylim(.5, 10)
# plt.xscale('log')


Out[40]:
(0.5, 10)

In [ ]: